package org.commoncrawl.tools;

import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.Semaphore;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PositionedReadable;
import org.apache.hadoop.fs.Seekable;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.util.Progressable;
import org.commoncrawl.util.CCStringUtils;

import com.amazonaws.AmazonServiceException;
import com.amazonaws.auth.BasicAWSCredentials;
import com.amazonaws.services.s3.AmazonS3Client;
import com.amazonaws.services.s3.model.ObjectListing;
import com.amazonaws.services.s3.model.ObjectMetadata;
import com.amazonaws.services.s3.model.S3ObjectSummary;
import com.google.common.io.CountingInputStream;

/**
 * Hacked together utility to break up the bulk Blekko URL list file into
 * smaller SequenceFile chunks and push them up to S3.
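 * <p>
 * The input is expected to be a gzipped text file with one
 * "url&lt;space&gt;metadata" pair per line. Example invocation (all values
 * below are illustrative placeholders, not real credentials, buckets, or
 * paths):
 * <pre>
 *   java org.commoncrawl.tools.BlekkoURLListTransfer \
 *     -awsKey ACCESS_KEY -awsSecret SECRET_KEY \
 *     -s3bucket some-bucket -s3path blekko/urls/ \
 *     -input /data/blekko-url-list.gz
 * </pre>
 * Since the s3path prefix is concatenated directly with each chunk file name,
 * it should normally end in a '/'.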
 *
 * @author rana
 */
public class BlekkoURLListTransfer {

  public static final Log LOG = LogFactory.getLog(BlekkoURLListTransfer.class);

  static Options options = new Options();

  static {
    options.addOption(OptionBuilder.withArgName("awsKey").hasArg().withDescription("AWS Key").isRequired()
        .create("awsKey"));
    options.addOption(OptionBuilder.withArgName("awsSecret").hasArg().withDescription("AWS Secret").isRequired()
        .create("awsSecret"));
    options.addOption(OptionBuilder.withArgName("s3bucket").hasArg().withDescription("S3 bucket name").isRequired()
        .create("s3bucket"));
    options.addOption(OptionBuilder.withArgName("s3path").hasArg().withDescription("S3 path prefix").isRequired()
        .create("s3path"));
    options.addOption(OptionBuilder.withArgName("input").hasArg().withDescription("Input URL List").isRequired()
        .create("input"));
  }

  static void printUsage() {
    HelpFormatter formatter = new HelpFormatter();
    formatter.printHelp("BlekkoURLListTransfer", options);
  }

  private static final String IN_MEMORY_FS_URI = "imfs://localhost/";
  private static final int HOLDING_BUFFER_SIZE = 1 << 16;
  private static final int SCAN_BUFFER_SIZE = HOLDING_BUFFER_SIZE / 4;
  private static final int CANNED_FILE_COMPRESSED_BLOCK_SIZE = 1 << 20; // 1 MB
  private static final int CANNED_FILE_SIZE = CANNED_FILE_COMPRESSED_BLOCK_SIZE * 100;
  private static final int CANNED_FILE_SIZE_PAD = CANNED_FILE_COMPRESSED_BLOCK_SIZE * 10;
  private static final Path CANNED_FILE_PATH = new Path("/tmp/cannedFile");
  private static final String COMPLETION_FILE_SUFFIX = "COMPLETE";

  /**
   * Minimal in-memory FileSystem used to stage a single SequenceFile in RAM
   * before it is shipped to S3.
   */
  private static class InMemoryFSHack extends FileSystem {

    static class CustomInputStream extends DataInputBuffer implements PositionedReadable, Seekable {

      int offset;
      int len;

      public CustomInputStream(byte[] data, int offset, int length) {
        this.offset = offset;
        this.len = length;
        super.reset(data, offset, length);
      }

      @Override
      public void readFully(long position, byte[] buffer) throws IOException {
        read(position, buffer, 0, buffer.length);
      }

      @Override
      public void readFully(long position, byte[] buffer, int offset, int length) throws IOException {
        read(position, buffer, offset, length);
      }

      @Override
      public int read(long position, byte[] buffer, int offset, int length) throws IOException {
        // honor the caller supplied offset/length (the original always copied
        // buffer.length bytes to offset zero)
        System.arraycopy(this.getData(), (int) position, buffer, offset, length);
        return length;
      }

      @Override
      public void seek(long pos) throws IOException {
        super.reset(getData(), offset + (int) pos, len - (int) pos);
      }

      @Override
      public long getPos() throws IOException {
        return super.getPosition();
      }

      @Override
      public boolean seekToNewSource(long targetPos) throws IOException {
        return false;
      }
    }

    InMemoryFSHack(Configuration conf) {
      setConf(conf);
    }

    DataOutputBuffer outputStream = new DataOutputBuffer(CANNED_FILE_SIZE + CANNED_FILE_SIZE_PAD);

    @Override
    public URI getUri() {
      try {
        return new URI(IN_MEMORY_FS_URI);
      } catch (URISyntaxException e) {
        return null;
      }
    }

    @Override
    public FSDataInputStream open(Path f, int bufferSize) throws IOException {
      CustomInputStream inputStream = new CustomInputStream(outputStream.getData(), 0, outputStream.getLength());
      return new FSDataInputStream(inputStream);
    }

    @Override
    public FSDataOutputStream create(Path f, FsPermission permission, boolean overwrite, int bufferSize,
        short replication, long blockSize, Progressable progress) throws IOException {
      outputStream.reset();
      return new FSDataOutputStream(outputStream, null);
    }
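    // the remaining FileSystem methods are stubs: this "file system" only ever
    // holds a single write-once file, so append/rename/delete-by-path/list and
    // mkdirs are never exercised by the SequenceFile code paths used here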
    @Override
    public FSDataOutputStream append(Path f, int bufferSize, Progressable progress) throws IOException {
      return null;
    }

    @Override
    public boolean rename(Path src, Path dst) throws IOException {
      return false;
    }

    @Override
    @Deprecated
    public boolean delete(Path f) throws IOException {
      return false;
    }

    @Override
    public boolean delete(Path f, boolean recursive) throws IOException {
      outputStream.reset();
      return true;
    }

    /** detach the current buffer and replace it with a fresh one */
    public DataOutputBuffer swapBuffers() {
      DataOutputBuffer out = outputStream;
      outputStream = new DataOutputBuffer(CANNED_FILE_SIZE + CANNED_FILE_SIZE_PAD);
      return out;
    }

    @Override
    public FileStatus[] listStatus(Path f) throws IOException {
      return null;
    }

    @Override
    public void setWorkingDirectory(Path new_dir) {
    }

    @Override
    public Path getWorkingDirectory() {
      return null;
    }

    @Override
    public boolean mkdirs(Path f, FsPermission permission) throws IOException {
      return false;
    }

    @Override
    public FileStatus getFileStatus(Path f) throws IOException {
      return new FileStatus(outputStream.getLength(), false, 1, 1, 1, CANNED_FILE_PATH);
    }
  }

  /**
   * Read the next line from the decompressed stream, split it at the first
   * space into url/metadata, and append the pair to the SequenceFile writer.
   *
   * @return the number of decompressed bytes consumed so far
   */
  private static long readWriteNextLine(CountingInputStream is, ByteBuffer inputBuffer,
      DataOutputBuffer outputBuffer, SequenceFile.Writer writer) throws IOException {
    outputBuffer.reset();
    for (;;) {
      if (inputBuffer.remaining() == 0) {
        int bytesRead = is.read(inputBuffer.array());
        if (bytesRead == -1) {
          throw new EOFException();
        } else {
          inputBuffer.clear();
          inputBuffer.limit(bytesRead);
        }
      }
      int scanStartPos = inputBuffer.position();
      boolean eos = false;
      while (inputBuffer.remaining() != 0) {
        byte nextChar = inputBuffer.get();
        if ((nextChar == '\n') || (nextChar == '\r')) {
          eos = true;
          break;
        }
      }
      // put whatever we read into the output buffer, excluding the line
      // delimiter itself (the original kept the trailing newline, which ended
      // up embedded in the metadata value)
      int scannedBytes = inputBuffer.position() - scanStartPos;
      outputBuffer.write(inputBuffer.array(), scanStartPos, eos ? scannedBytes - 1 : scannedBytes);
      if (eos) {
        break;
      }
    }

    String line = new String(outputBuffer.getData(), 0, outputBuffer.getLength(), Charset.forName("UTF-8"));
    int spaceDelimiter = line.indexOf(' ');
    if (spaceDelimiter != -1 && spaceDelimiter < line.length() - 1) {
      String url = line.substring(0, spaceDelimiter);
      String metadata = line.substring(spaceDelimiter + 1);
      if (url.length() != 0 && metadata.length() != 0) {
        writer.append(new Text(url), new Text(metadata));
      }
    }
    // consumed == total bytes pulled off the stream minus whatever is still
    // sitting unread in the scan buffer (the original added the buffer
    // position, which double counted already consumed bytes)
    return is.getCount() - inputBuffer.remaining();
  }

  private static boolean scanForCompletionFile(AmazonS3Client s3Client, String s3Bucket, String s3Path)
      throws IOException {
    String finalPath = s3Path + COMPLETION_FILE_SUFFIX;
    try {
      s3Client.getObjectMetadata(s3Bucket, finalPath);
      return true;
    } catch (AmazonServiceException e) {
      if (e.getStatusCode() == 404) {
        return false;
      } else {
        throw new IOException(e);
      }
    }
  }

  private static Pattern seqFilePattern = Pattern.compile(".*/([0-9]*)\\.seq");

  private static SequenceFile.Writer flushFile(InMemoryFSHack fs, Configuration conf, Uploader uploader,
      String s3Bucket, String s3FolderPath, long lastValidReadPos, SequenceFile.Writer writer) throws IOException {
    writer.close();

    String fullS3Path = s3FolderPath + Long.toString(lastValidReadPos) + ".seq";
    // ok, detach the buffer (note: getFileStatus would report the fresh, empty
    // buffer after the swap, so log the detached buffer's length instead)
    DataOutputBuffer bufferOut = fs.swapBuffers();
    DataInputBuffer inputStream = new DataInputBuffer();
    inputStream.reset(bufferOut.getData(), 0, bufferOut.getLength());
    QueueItem queueItem = new QueueItem(s3Bucket, fullS3Path, inputStream);
    LOG.info("Queueing for Upload File:" + fullS3Path + " of size:" + bufferOut.getLength() + " to S3");
    try {
      uploader.queue.put(queueItem);
    } catch (InterruptedException e) {
    }
    LOG.info("Queued for Upload File:" + fullS3Path + " of size:" + bufferOut.getLength() + " to S3");
    return createWriter(fs, conf);
  }
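  /**
   * Each uploaded chunk is named for the decompressed-stream offset at which it
   * was cut (e.g. "104857600.seq"), so the highest such number under the prefix
   * tells us where a restarted run should resume.
   */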
  private static long scanForLastValidOffset(AmazonS3Client s3Client, String s3Bucket, String s3Path)
      throws IOException {
    ObjectListing listing = s3Client.listObjects(s3Bucket, s3Path);
    boolean done = false;
    long lastValidOffsetOut = 0L;
    do {
      for (S3ObjectSummary summary : listing.getObjectSummaries()) {
        Matcher seqFileMatcher = seqFilePattern.matcher(summary.getKey());
        if (seqFileMatcher.matches()) {
          lastValidOffsetOut = Math.max(lastValidOffsetOut, Long.parseLong(seqFileMatcher.group(1)));
        }
      }
      if (listing.isTruncated()) {
        listing = s3Client.listNextBatchOfObjects(listing);
      } else {
        done = true;
      }
    } while (!done);
    return lastValidOffsetOut;
  }

  private static SequenceFile.Writer createWriter(FileSystem fs, Configuration conf) throws IOException {
    return SequenceFile.createWriter(fs, conf, CANNED_FILE_PATH, Text.class, Text.class, CompressionType.BLOCK,
        new SnappyCodec());
  }

  public static class QueueItem {

    String bucket;
    String path;
    DataInputBuffer payload;

    public QueueItem() {
    }

    public QueueItem(String bucket, String path, DataInputBuffer payload) {
      this.bucket = bucket;
      this.path = path;
      this.payload = payload;
    }
  }

  public static class Uploader {

    static final int MAX_BACKLOG_SIZE = 15;
    static final int UPLOADER_THREAD_COUNT = 10;

    LinkedBlockingQueue<QueueItem> queue = new LinkedBlockingQueue<QueueItem>(MAX_BACKLOG_SIZE);
    Thread threads[] = new Thread[UPLOADER_THREAD_COUNT];
    AmazonS3Client s3Client;
    // starts negative so that acquire() only succeeds once all
    // UPLOADER_THREAD_COUNT threads have called release()
    Semaphore runningWaitSemaphore = new Semaphore(-(UPLOADER_THREAD_COUNT - 1));

    public Uploader(String awsAccessKey, String awsSecret) throws IOException {
      BasicAWSCredentials credentials = new BasicAWSCredentials(awsAccessKey, awsSecret);
      // create the client ...
      s3Client = new AmazonS3Client(credentials);

      for (int i = 0; i < UPLOADER_THREAD_COUNT; ++i) {
        // capture the thread index for the closure ...
        final int threadIndex = i;
        threads[threadIndex] = new Thread(new Runnable() {

          @Override
          public void run() {
            try {
              while (true) {
                try {
                  QueueItem item = queue.take();
                  if (item.payload == null) {
                    LOG.info("UPLOADER_THREAD[" + threadIndex + "]: Received NULL Queue Item. Exiting");
                    break;
                  } else {
                    boolean done = false;
                    int retryCount = 0;
                    while (!done) {
                      try {
                        long flushStartTime = System.currentTimeMillis();
                        ObjectMetadata metadata = new ObjectMetadata();
                        metadata.setContentLength(item.payload.getLength());
                        // rewind the payload in case a failed attempt already
                        // consumed part of the stream
                        item.payload.reset(item.payload.getData(), item.payload.getLength());
                        s3Client.putObject(item.bucket, item.path, item.payload, metadata);
                        long flushEndTime = System.currentTimeMillis();
                        LOG.info("UPLOADER_THREAD[" + threadIndex + "]: Flushing Finished for File:" + item.path
                            + " of size:" + item.payload.getLength() + " Took:" + (flushEndTime - flushStartTime));
                        done = true;
                      } catch (Exception e) {
                        LOG.error("UPLOADER_THREAD[" + threadIndex + "]: Exception While Flushing File:" + item.path
                            + " of size:" + item.payload.getLength() + " Exception:"
                            + CCStringUtils.stringifyException(e) + " RetryCount:" + retryCount);
                        ++retryCount;
                      }
                    }
                  }
                } catch (InterruptedException e) {
                }
              }
              LOG.info("UPLOADER_THREAD[" + threadIndex + "]: DONE");
            } finally {
              runningWaitSemaphore.release();
            }
          }
        });
        threads[threadIndex].start();
      }
    }
  }
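  /**
   * Flow: parse arguments; exit early if a COMPLETE marker already exists under
   * the S3 prefix; otherwise resume from the highest previously uploaded
   * offset, stream-decompress the input, pack url/metadata lines into in-memory
   * SequenceFiles of roughly CANNED_FILE_SIZE bytes, and hand each finished
   * chunk to the Uploader thread pool.
   */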
  public static void main(String[] args) {
    CommandLineParser parser = new GnuParser();
    try {
      // parse the command line arguments
      CommandLine cmdLine = parser.parse(options, args);

      BasicAWSCredentials credentials = new BasicAWSCredentials(cmdLine.getOptionValue("awsKey"),
          cmdLine.getOptionValue("awsSecret"));
      // create the client ...
      AmazonS3Client s3Client = new AmazonS3Client(credentials);
      // create the uploader thread pool ...
      Uploader uploader = new Uploader(cmdLine.getOptionValue("awsKey"), cmdLine.getOptionValue("awsSecret"));

      // the (gzipped) input file ...
      File inputFile = new File(cmdLine.getOptionValue("input"));

      // allocate in memory file system
      Configuration conf = new Configuration();
      conf.setInt("io.seqfile.compress.blocksize", CANNED_FILE_COMPRESSED_BLOCK_SIZE);
      InMemoryFSHack fsHack = new InMemoryFSHack(conf);

      DataOutputBuffer outputBuffer = new DataOutputBuffer(HOLDING_BUFFER_SIZE);

      // get bucket and path parameters
      String s3bucket = cmdLine.getOptionValue("s3bucket");
      String s3path = cmdLine.getOptionValue("s3path");

      // scan for completion marker ...
      if (scanForCompletionFile(s3Client, s3bucket, s3path) == false) {
        // scan existing files to find last decompressed offset ...
        long lastReadPos = scanForLastValidOffset(s3Client, s3bucket, s3path);
        LOG.info("Last Valid Read Pos:" + lastReadPos);

        // open input stream ...
        CountingInputStream countingInputStream = new CountingInputStream(new FileInputStream(inputFile));
        // setup inflater ...
        LOG.info("Initializing GZIP Stream for File at:" + inputFile);
        GZIPInputStream inflater = new GZIPInputStream(countingInputStream, SCAN_BUFFER_SIZE);
        // init counting stream to wrap inflater
        CountingInputStream countingDecompressedStream = new CountingInputStream(inflater);

        // skip to the last scan offset; skip on the counting stream so the
        // resume offset accounting stays consistent, and loop, since skip()
        // may skip fewer bytes than requested
        long toSkip = lastReadPos;
        while (toSkip > 0) {
          long skipped = countingDecompressedStream.skip(toSkip);
          if (skipped <= 0) {
            throw new IOException("Failed to skip to resume offset:" + lastReadPos);
          }
          toSkip -= skipped;
        }

        ByteBuffer scanBuffer = ByteBuffer.allocate(SCAN_BUFFER_SIZE);
        boolean eof = false;

        // read input file, collecting lines into buffer ...
        long lineCount = 0;

        // create sequence file ...
        SequenceFile.Writer writer = createWriter(fsHack, conf);

        while (!eof) {
          try {
            lastReadPos = readWriteNextLine(countingDecompressedStream, scanBuffer, outputBuffer, writer);
            ++lineCount;
            if (lineCount % 10000 == 0) {
              LOG.info("Read 10000 lines RAW Pos:" + countingInputStream.getCount() + " lastReadPos:" + lastReadPos
                  + " TotalLines:" + lineCount);
            }
          } catch (EOFException e) {
            LOG.info("HIT EOF AT Raw Pos:" + countingInputStream.getCount() + " lastReadPos:" + lastReadPos);
            eof = true;
          }
          // once our buffer flush threshold is hit, or at eof ...
          if (eof || writer.getLength() >= CANNED_FILE_SIZE) {
            // flush buffer to s3
            writer = flushFile(fsHack, conf, uploader, s3bucket, s3path, lastReadPos, writer);
            // reset output buffer
            outputBuffer.reset();
          }
        }

        LOG.info("Done Processing Data. Queueing Empty Items");
        // one empty (poison pill) item per uploader thread; a single pill
        // would only shut down one of the UPLOADER_THREAD_COUNT threads
        for (int i = 0; i < Uploader.UPLOADER_THREAD_COUNT; ++i) {
          uploader.queue.put(new QueueItem());
        }
        LOG.info("Waiting for Uploader Threads to Die");
        uploader.runningWaitSemaphore.acquireUninterruptibly();
        LOG.info("Uploader Threads Dead. Exiting");
      }
    } catch (ParseException e) {
      System.out.println("Error parsing command line:" + e.getMessage());
      printUsage();
    } catch (Exception exp) {
      // oops, something went wrong
      LOG.error(CCStringUtils.stringifyException(exp));
      printUsage();
    }
  }
}